# Digitised books
import pandas as pd
import altair as alt
from IPython.display import HTML
# Lift Altair's default row cap so charts can be built from the full dataset.
alt.data_transformers.disable_max_rows()
DataTransformerRegistry.enable('default')
# Pull the book metadata CSV straight from the GLAM Workbench repository.
# keep_default_na=False stops pandas turning blank cells into NaN, so missing
# values stay as empty strings.
df = pd.read_csv(
    "https://raw.githubusercontent.com/GLAM-Workbench/trove-books-data/main/trove-books.csv",
    keep_default_na=False,
)
# Number of rows and columns in the dataset.
df.shape
(21218, 24)
# Size of the subset of records whose text_file value is not blank.
has_text_file = df["text_file"] != ""
df.loc[has_text_file].shape
(17692, 24)
# The rights column can hold several statements joined by " | "; give each
# statement its own row before tallying how often it appears.
rights_statements = df["rights"].str.split(" | ", regex=False).explode()
rights_statements.value_counts()
rights
http://rightsstatements.org/vocab/NKC/1.0/ 17300
Out of Copyright 15848
No known copyright restrictions 4492
433
In Copyright 397
Perpetual 46
Mixed copyright 2
Out of copyright 2
http://creativecommons.org/licenses/by-nc-nd/3.0/au 1
Name: count, dtype: int64
# Split multi-language records on " | " so each language label gets its own row.
language_lists = df["language"].str.split(" | ", regex=False)
languages = language_lists.explode()
# Ten most frequent language labels.
languages.value_counts().head(10)
language
English 17903
Chinese 1685
518
French 256
Undetermined 162
German 132
Japanese 95
Italian 89
Austronesian (Other) 80
Dutch 77
Name: count, dtype: int64
# Number of distinct language labels. NOTE(review): the blank label that appears
# in the value counts is a distinct value too, so it is included in this total.
languages.nunique()
251
# Tally each language label, then drop the entries that shouldn't be drawn.
language_counts = languages.value_counts().to_dict()
# English outnumbers every other language by an order of magnitude, so leave it
# out to keep the remaining languages legible. pop() with a default avoids a
# KeyError if the label is ever absent from the data.
language_counts.pop("English", None)
# Records with no language value carry a blank label; it isn't a language, so
# keep it out of the cloud.
language_counts.pop("", None)

from wordcloud import WordCloud

# Size each language name by the number of records that carry it.
wc = WordCloud(width=800, height=400)
wc.generate_from_frequencies(language_counts)
wc.to_image()
# Inspect the pages column, which the histogram further down bins.
df["pages"]
0 130
1 24
2 24
3 65
4 246
...
21213 86
21214 52
21215 60
21216 6
21217 44
Name: pages, Length: 21218, dtype: int64
# Take the first four-digit year (1600-2099) found in each date string.
# expand=False returns a Series rather than a one-column DataFrame; records with
# no recognisable year end up as NaN and are ignored by value_counts().
df["year"] = df["date"].str.extract(r"\b((?:16|17|18|19|20)\d{2})\b", expand=False)
# Name the columns explicitly: older pandas releases label the value_counts()
# output differently, which would break the "year"/"count" encodings below.
year_counts = df["year"].value_counts().rename_axis("year").reset_index(name="count")
# One thin bar per year, with the year strings parsed as temporal values.
alt.Chart(year_counts).mark_bar(size=1).encode(
    x="year:T",
    y="count:Q",
).properties(width=800, height=200)
# Histogram of the pages values. Only the pages column is handed to Altair so
# the chart spec doesn't embed every other column of the dataset.
alt.Chart(df[["pages"]]).mark_bar().encode(
    x=alt.X("pages:Q").bin(),
    y="count()",
).properties(width=800, height=200)